{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "YEdilk144fzb"
   },
   "source": [
    "# Ungraded Lab: Training a Sarcasm Detection Model using a Convolution Layer\n",
    "\n",
    "You will be doing the same steps here as the previous lab but will be using a convolution layer instead. As usual, try tweaking the parameters and observe how it affects the results.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "LQ2W9qyVs2UN"
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import matplotlib.pyplot as plt\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "pmokcpHc5u1R"
   },
   "source": [
    "## Load the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dxezdGoV29Yz"
   },
   "outputs": [],
   "source": [
    "# The dataset is already downloaded for you. For downloading you can use the code below.\n",
    "# !wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "BTcGA2Po2_nN"
   },
   "outputs": [],
   "source": [
    "# Load the JSON file\n",
    "with open(\"./sarcasm.json\", 'r') as f:\n",
    "    datastore = json.load(f)\n",
    "\n",
    "# Initialize the lists\n",
    "sentences = []\n",
    "labels = []\n",
    "\n",
    "# Collect sentences and labels into the lists\n",
    "for item in datastore:\n",
    "    sentences.append(item['headline'])\n",
    "    labels.append(item['is_sarcastic'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OD24pUiX_6ja"
   },
   "source": [
    "## Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "QrbbP-oP_5tx"
   },
   "outputs": [],
   "source": [
    "# Number of examples to use for training\n",
    "TRAINING_SIZE = 20000\n",
    "\n",
    "# Vocabulary size of the tokenizer\n",
    "VOCAB_SIZE = 10000\n",
    "\n",
    "# Maximum length of the padded sequences\n",
    "MAX_LENGTH = 32\n",
    "\n",
    "# Type of padding\n",
    "PADDING_TYPE = 'pre'\n",
    "\n",
    "# Specifies how to truncate the sequences\n",
    "TRUNC_TYPE = 'post'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F2zXSds45s2P"
   },
   "source": [
    "## Split the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "baDwTn9S3ENB"
   },
   "outputs": [],
   "source": [
    "# Split the sentences\n",
    "train_sentences = sentences[0:TRAINING_SIZE]\n",
    "test_sentences = sentences[TRAINING_SIZE:]\n",
    "\n",
    "# Split the labels\n",
    "train_labels = labels[0:TRAINING_SIZE]\n",
    "test_labels = labels[TRAINING_SIZE:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NdpLY-or5pTP"
   },
   "source": [
    "## Data preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "C2xJz4hLiW8-"
   },
   "outputs": [],
   "source": [
    "# Instantiate the vectorization layer\n",
    "vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)\n",
    "\n",
    "# Generate the vocabulary based on the training inputs\n",
    "vectorize_layer.adapt(train_sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "667RxU6mikTo"
   },
   "outputs": [],
   "source": [
    "# Preprocess the train and test data\n",
    "train_dataset = tf.data.Dataset.from_tensor_slices((train_sentences,train_labels))\n",
    "test_dataset = tf.data.Dataset.from_tensor_slices((test_sentences,test_labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "DDaLeWnptKx2"
   },
   "outputs": [],
   "source": [
    "def preprocessing_fn(dataset):\n",
    "  '''Generates padded sequences from a tf.data.Dataset'''\n",
    "\n",
    "  # Apply the vectorization layer to the reviews\n",
    "  dataset_sequences = dataset.map(lambda review, label: (vectorize_layer(review), label))\n",
    "\n",
    "  # Put all elements in a single ragged batch\n",
    "  dataset_sequences = dataset_sequences.ragged_batch(batch_size=dataset_sequences.cardinality())\n",
    "\n",
    "  # Output a tensor from the single batch. Extract the sequences and labels.\n",
    "  sequences, labels = dataset_sequences.get_single_element()\n",
    "\n",
    "  # Pad the sequences\n",
    "  padded_sequences = tf.keras.utils.pad_sequences(sequences.numpy(), maxlen=MAX_LENGTH, truncating=TRUNC_TYPE, padding=PADDING_TYPE)\n",
    "\n",
    "  # Convert back to a tf.data.Dataset\n",
    "  padded_sequences = tf.data.Dataset.from_tensor_slices(padded_sequences)\n",
    "  labels = tf.data.Dataset.from_tensor_slices(labels)\n",
    "\n",
    "  # Combine the padded sequences and labels\n",
    "  dataset_vectorized = tf.data.Dataset.zip(padded_sequences, labels)\n",
    "\n",
    "  return dataset_vectorized"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7s4lQtlhtOEq"
   },
   "outputs": [],
   "source": [
    "# Preprocess the train and test data\n",
    "train_dataset_vectorized = train_dataset.apply(preprocessing_fn)\n",
    "test_dataset_vectorized = test_dataset.apply(preprocessing_fn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "7yD8d7YdtTYC"
   },
   "outputs": [],
   "source": [
    "# View 2 training sequences and its labels\n",
    "for example in train_dataset_vectorized.take(2):\n",
    "  print(example)\n",
    "  print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "nrvjR3wdizDn"
   },
   "outputs": [],
   "source": [
    "SHUFFLE_BUFFER_SIZE = 1000\n",
    "PREFETCH_BUFFER_SIZE = tf.data.AUTOTUNE\n",
    "BATCH_SIZE = 32\n",
    "\n",
    "# Optimize and batch the datasets for training\n",
    "train_dataset_final = (train_dataset_vectorized\n",
    "                       .cache()\n",
    "                       .shuffle(SHUFFLE_BUFFER_SIZE)\n",
    "                       .prefetch(PREFETCH_BUFFER_SIZE)\n",
    "                       .batch(BATCH_SIZE)\n",
    "                       )\n",
    "\n",
    "test_dataset_final = (test_dataset_vectorized\n",
    "                      .cache()\n",
    "                      .prefetch(PREFETCH_BUFFER_SIZE)\n",
    "                      .batch(BATCH_SIZE)\n",
    "                      )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "yQ_goiTa6Lay"
   },
   "source": [
    "## Plot Utility"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ikKN5Mqj6Kjz"
   },
   "outputs": [],
   "source": [
    "def plot_loss_acc(history):\n",
    "  '''Plots the training and validation loss and accuracy from a history object'''\n",
    "  acc = history.history['accuracy']\n",
    "  val_acc = history.history['val_accuracy']\n",
    "  loss = history.history['loss']\n",
    "  val_loss = history.history['val_loss']\n",
    "\n",
    "  epochs = range(len(acc))\n",
    "\n",
    "  fig, ax = plt.subplots(1,2, figsize=(12, 6))\n",
    "  ax[0].plot(epochs, acc, 'bo', label='Training accuracy')\n",
    "  ax[0].plot(epochs, val_acc, 'b', label='Validation accuracy')\n",
    "  ax[0].set_title('Training and validation accuracy')\n",
    "  ax[0].set_xlabel('epochs')\n",
    "  ax[0].set_ylabel('accuracy')\n",
    "  ax[0].legend()\n",
    "\n",
    "  ax[1].plot(epochs, loss, 'bo', label='Training Loss')\n",
    "  ax[1].plot(epochs, val_loss, 'b', label='Validation Loss')\n",
    "  ax[1].set_title('Training and validation loss')\n",
    "  ax[1].set_xlabel('epochs')\n",
    "  ax[1].set_ylabel('loss')\n",
    "  ax[1].legend()\n",
    "\n",
    "  plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HQBjPv_A5m1x"
   },
   "source": [
    "## Build and Compile the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "jGwXGIXvFhXW"
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "EMBEDDING_DIM = 16\n",
    "FILTERS = 128\n",
    "KERNEL_SIZE = 5\n",
    "DENSE_DIM = 6\n",
    "\n",
    "# Model Definition with Conv1D\n",
    "model_conv = tf.keras.Sequential([\n",
    "    tf.keras.Input(shape=(MAX_LENGTH,)),\n",
    "    tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),\n",
    "    tf.keras.layers.Conv1D(FILTERS, KERNEL_SIZE, activation='relu'),\n",
    "    tf.keras.layers.GlobalMaxPooling1D(),\n",
    "    tf.keras.layers.Dense(DENSE_DIM, activation='relu'),\n",
    "    tf.keras.layers.Dense(1, activation='sigmoid')\n",
    "])\n",
    "\n",
    "# Set the training parameters\n",
    "model_conv.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
    "\n",
    "# Print the model summary\n",
    "model_conv.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PcXC5QG45kM7"
   },
   "source": [
    "## Train the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "oB6C55FO3z3q"
   },
   "outputs": [],
   "source": [
    "NUM_EPOCHS = 10\n",
    "\n",
    "# Train the model\n",
    "history_conv = model_conv.fit(train_dataset_final, epochs=NUM_EPOCHS, validation_data=test_dataset_final)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "g9DC6dmLF8DC"
   },
   "outputs": [],
   "source": [
    "plot_loss_acc(history_conv)"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "C3_W3_Lab_6_sarcasm_with_1D_convolutional.ipynb",
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0rc1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
